# install requirements
# !pip install -r requirements.txt
import plotly
plotly.offline.init_notebook_mode()  # enable inline (offline) plotly rendering in the notebook
We will perform the EDA mainly by searching for nulls and exploring the dataset variables and their relationships. We use the dtale library to perform most of the exploration, but we only keep the major highlights here.
import pandas as pd
import dtale
# Daily observations per gauging station (flux, precip, temp_max plus station
# metadata). 'date' is intentionally kept as a 'YYYY-MM-DD' string for now;
# parsing to datetime is deferred until it is actually needed.
flux_df = pd.read_csv('challenge_watershed/flux.csv') # parse_dates=['date']
flux_df.head()
| date | basin_id | flux | precip | temp_max | gauge_name | lat | lon | mean_elev | area_km2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1980-01-01 | 1001001 | 0.579 | 0.0 | 10.685653 | Rio Caquena En Nacimiento | -18.0769 | -69.1961 | 4842.449328 | 49.711859 |
| 1 | 1980-01-02 | 1001001 | 0.543 | 0.0 | 11.470960 | Rio Caquena En Nacimiento | -18.0769 | -69.1961 | 4842.449328 | 49.711859 |
| 2 | 1980-01-03 | 1001001 | 0.482 | 0.0 | 11.947457 | Rio Caquena En Nacimiento | -18.0769 | -69.1961 | 4842.449328 | 49.711859 |
| 3 | 1980-01-04 | 1001001 | 0.459 | 0.0 | 12.424489 | Rio Caquena En Nacimiento | -18.0769 | -69.1961 | 4842.449328 | 49.711859 |
| 4 | 1980-01-05 | 1001001 | 0.436 | 0.0 | 12.649203 | Rio Caquena En Nacimiento | -18.0769 | -69.1961 | 4842.449328 | 49.711859 |
%matplotlib inline
# number of registers by station
print(flux_df[['gauge_name','basin_id']].value_counts())
# check id unique
# flux_df.groupby(['gauge_name','basin_id'])['basin_id'].nunique().sort_values()
#
# Histogram of record counts per (gauge_name, basin_id) pair — shows how
# unevenly the recorded history is distributed across stations.
flux_df[['gauge_name','basin_id']].value_counts().plot.hist(bins=100)
gauge_name basin_id
Rio Aconcagua En Chacabuquito 5410002 14670
Rio Cruces En Rucaco 10134001 14649
Rio Choapa En Cuncumen 4703002 14639
Rio Elqui En Algarrobal 4320001 14634
Rio Cautin En Cajon 9129002 14603
...
Estero Chimbarongo En Santa Cruz 6034001 328
Rio Chillan En Longitudinal 8117001 302
Estero Las Vegas Aguas Abajo Canal Las Vegas 5423002 195
Rio Pama Entrada Embalse Cogoti 4534001 195
Rio Blanco En Chaiten 10683002 175
Length: 503, dtype: int64
<AxesSubplot:ylabel='Frequency'>
As we can observe, there are a few stations with fewer than 2000 registers, which probably means they have much less history than the other stations. For the time-series analysis we will probably need to find a minimum common time window so that we can use as much of that data as possible.
Given that we need to perform some outlier/extreme-value analysis, we want a fixed "panel" of stations that is as homogeneous as possible, over as wide a period as possible. This is mainly to detect changes in the trend of outliers while removing the inherent bias from the newer/smaller stations. We can plot the data timeline of the stations.
import plotly.express as px

# For each station, find the first and last recorded date and draw a
# Gantt-style timeline so we can see which stations cover which periods.
# NOTE: the aggregations are passed as strings ('min'/'max'); passing the
# builtins min/max is deprecated in modern pandas and resolves to the same
# reductions, so the resulting columns are unchanged.
reg_period_df = flux_df.groupby('gauge_name', as_index=False).agg({'date': ['min', 'max']})
# Flatten the MultiIndex columns: ('date', 'min') -> 'date_min', etc.
reg_period_df.columns = ['_'.join(tup).rstrip('_') for tup in reg_period_df.columns.values]
reg_period_df.sort_values(by=["date_min", "date_max"], inplace=True)
fig = px.timeline(reg_period_df, x_start="date_min", x_end="date_max", y="gauge_name", width=800, height=1400)
fig.show()
If we observe the timeline of registers by station, we see that many stations were added recently; most of them have been active the entire time, but some were deactivated a while ago. (This analysis ignores missing data between the first and last register, but it is still a good visualization of data availability.)
We can use all the basin_ids to fit a prediction model, but we need to be careful in the "change of trend" analysis, especially because if the size/representation of the station panel varies over time, we might misattribute the effect of time on the number of extreme events.
# Missing-value overview: first the file-wide null counts per column, then the
# share of non-null observations per basin (sorted so the most incomplete
# basins for 'precip' come first).
print('Number of nulls by column')
column_null_counts = flux_df.isnull().sum(axis=0)
print(column_null_counts)
# and by basin_id
print('Missing data percentage by basin_id')
per_basin_completeness = flux_df.groupby("basin_id").apply(lambda g: g.notnull().mean())
nulls_df = per_basin_completeness.sort_values(by='precip')
dtale.show(nulls_df[['precip','temp_max']])
Number of nulls by column date 0 basin_id 0 flux 0 precip 5443 temp_max 5443 gauge_name 0 lat 0 lon 0 mean_elev 0 area_km2 0 dtype: int64 Missing data percentage by basin_id
Given that only a few basin_ids have missing registers, and less than 3% of their data is missing, it is possible to do some imputation: using (linear) interpolation to fill the values, ffill/bfill, etc.
There are other values that might be missing as well, entire days for some basin_id. This will be particularly problematic on the prediction stage, nevertheless, we will do the "data imputation" on the prediction stage, and for simplicity we will drop the missing rows to understand other characteristics of the Dataset
We now see how the stations are distributed geographically, this is important because they are very different in latitude therefore, they probably present very different precipitations and temperatures, plotting all the stations will help us to see if there is an inherent deviation if we subsample the panel.
import plotly.express as px

# Derive year-month and year keys from the (string) date column.
flux_df['date_ym'] = flux_df['date'].str[:7]
flux_df['date_y'] = flux_df['date'].str[:4]
# Yearly record count per station; lat/lon are kept in the key for plotting.
reg_month_df = flux_df.groupby(['basin_id', 'lat', 'lon', 'date_y'], as_index=False)['gauge_name'].count()
# Animated map of South America: one frame per year, bubble size encodes the
# number of records the station produced that year.
fig = px.scatter_geo(
    reg_month_df,
    lat='lat',
    lon='lon',
    hover_name="basin_id",
    size="gauge_name",
    animation_frame="date_y",
    scope="south america",
    width=500,
    height=700,
    size_max=5,
)
fig.show()
The distribution seems steady over time; there is an increase of stations in the south and in the center, but in terms of the number of data points it does not seem very relevant. That is good news, because it will probably give us a representative sample of all latitudes.
we add the requested functions
## adding the plot functions for the stations
from typing import Literal
import plotly.express as px
# Parsed datetime copy of the string 'date' column; used as the x-axis in the
# time-series plots below (string dates are still used for range filtering).
flux_df['date_ts'] = pd.to_datetime(flux_df['date'])
# first function
def plot_one_timeserie(cod_station:int, variable:Literal['flux','precip','temp_max'], min_date:str, max_date:str) -> None:
    """Line-plot one variable of one station over [min_date, max_date].

    Dates are compared as ISO 'YYYY-MM-DD' strings, which order correctly
    lexicographically, so no datetime parsing is needed for the filter.
    """
    in_station = flux_df['basin_id'] == cod_station
    in_window = (flux_df['date'] >= min_date) & (flux_df['date'] <= max_date)
    station_df = flux_df[in_station & in_window].copy()
    px.line(station_df, x='date_ts', y=variable).show()
#test
plot_one_timeserie(cod_station=7355001,
                   variable='precip',
                   min_date='1998-01-01',
                   max_date='2022-12-12')
# second function
def plot_three_timeseries(cod_station: int,
                          min_date: str,
                          max_date: str,
                          variables: tuple = ('flux', 'precip', 'temp_max')) -> None:
    """Plot several variables of one station on a common normalized scale.

    The variables are min-max scaled to [0, 1] so series with very different
    units (m3/s, mm, °C) can share one y-axis. `variables` defaults to the
    original three columns, so existing calls are unchanged; any subset of
    numeric flux_df columns can be passed instead. Dates are compared as ISO
    'YYYY-MM-DD' strings, which order correctly lexicographically.
    """
    cols = list(variables)
    sub_df = flux_df[(flux_df['basin_id'] == cod_station) &
                     (flux_df['date'] >= min_date) &
                     (flux_df['date'] <= max_date)][cols + ['date_ts']].copy()
    # normalize each variable to [0, 1] (min-max) so they share one axis
    sub_df[cols] = (sub_df[cols] - sub_df[cols].min()) / (sub_df[cols].max() - sub_df[cols].min())
    fig = px.line(sub_df, x='date_ts', y=cols)
    fig.show()
#test
plot_three_timeseries(cod_station=7355001,
                      min_date='1998-01-01',
                      max_date='2022-12-12')
We would like to know whether the values of the variables (['flux','precip','temp_max']) correspond to extreme (or very unlikely) values. There are several techniques to infer how unlikely the values we are seeing are, some more complex than others:
We can assume that all the observations from each variable (temperatures, precip, flux) come from one single distribution (by station); then we can identify how unlikely one observation is by estimating the posterior probability (or, even more simplistically, the percentile). The limitation of this approach is that we will be prone to ignore seasonality effects — an extreme temperature in winter might be just a regular temperature in summer. To solve that we can implement a second approach:
We can split the observations into 4 seasonal sets, and then use the same approach as (1). The limitation of this approach will appear at the borders of our sets: because the seasons are just an arbitrary definition, we are prone to find outliers on the boundaries of our seasonal sets.
Another more complex approach is to use a model to predict the values trained on some past data (a few years), depending on how likely the observation from the prediction we flag it as an outlier, this will fix the seasonality problem, just because we can control by season (using the day of year or another variable to control for it). The limitation of this approach is that we will need to train the model at least with one part of the time series that we define as "normal", then from that point, we can predict and evaluate the probability.
There are many models that we can implement for outlier detection for time series, the simplest ones are moving averages or moving medians and other more sophisticated ts models
Particularly for this case, we will use a custom fit/predict with prophet, because it provides a simple implementation while also providing a probabilistic forecasting approach.
For the simplicity of the exercise, we are assuming that the extreme values are "station"-wise anomalies. Considering that climate and weather may affect several nearby stations, there should be a correlation between extreme observations at nearby stations too. With this information, we might be able to "cross-fit" our anomaly detection model using not only one station but also the stations near it. That means that if several flags are raised in the neighborhood, it is more likely that we are observing a real extreme event.
For the sake of time, we will not deep dive into that
from typing import List
from prophet import Prophet
from prophet.diagnostics import generate_cutoffs
from others import suppress_stdout_stderr

# Rolling-origin anomaly detection for one station/variable with Prophet:
# fit on all history up to each cutoff, forecast the next `horizon`, and flag
# observations above the 99% upper prediction bound as extreme values.
cod_station = 5410002
sub_df = flux_df[(flux_df['basin_id'] == cod_station)][['flux','precip','temp_max','date_ts']].copy()
variable = 'flux'
# Prophet expects a dataframe with columns 'ds' (datetime) and 'y' (target).
ts_df = sub_df[['date_ts', variable]].copy()
ts_df.rename({'flux':'y',
              'date_ts':'ds'}, axis=1, inplace=True)
baseline_train_yrs = 3
test_size = 365 * 2
horizon = pd.Timedelta(f'{test_size} days')
period = horizon  # step forward by a full horizon => non-overlapping folds
# fix: variable was previously misspelled `cuoffs_list`
cutoffs_list = generate_cutoffs(ts_df,
                                horizon=horizon,
                                period=period,
                                initial=pd.Timedelta(f'{baseline_train_yrs*365} days'),
                                )
forecast_df_list: List[pd.DataFrame] = []
for cutoff_date in cutoffs_list:
    # Train on all data up to the cutoff; earlier folds have already had their
    # flagged outliers blanked out of ts_df (see end of loop body).
    fold_ts_df = ts_df[ts_df['ds'] <= cutoff_date].copy()
    with suppress_stdout_stderr():
        model = Prophet(n_changepoints=0,
                        weekly_seasonality=False,
                        interval_width=0.99
                        )
        model.fit(fold_ts_df)
        future = model.make_future_dataframe(periods=test_size)
        forecast = model.predict(future)
    # Score the observations in the fold immediately after the cutoff.
    next_fold_ds = ts_df[(ts_df['ds'] > cutoff_date) &
                         (ts_df['ds'] <= cutoff_date + horizon)].copy()
    forecasted_fold_ds = pd.merge(next_fold_ds, forecast, how='left', on='ds')
    # Only values ABOVE the upper bound are flagged (high-flux extremes); the
    # symmetric lower-bound check is intentionally disabled.
    forecasted_fold_ds['anomaly_flag'] = ((forecasted_fold_ds['y'] > forecasted_fold_ds['yhat_upper'])
    #                                      | (forecasted_fold_ds['y'] < forecasted_fold_ds['yhat_lower']
                                          )
    forecast_df_list.append(forecasted_fold_ds)
    # Blank out the flagged outliers in the training series so later folds are
    # not fitted on extreme values.
    ts_df.loc[ts_df['ds'].isin(forecasted_fold_ds[forecasted_fold_ds['anomaly_flag']]['ds']), 'y'] = None
    print(f'{cutoff_date = } num_outliers = {forecasted_fold_ds["anomaly_flag"].sum()}')
outliers_df = pd.concat(forecast_df_list)
2022-10-24 17:34:38,364 - INFO - Making 18 forecasts with cutoffs between 1984-06-15 00:00:00 and 2018-06-07 00:00:00 2022-10-24 17:34:38,386 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:34:38 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:34:38,410 - INFO - Chain [1] start processing 17:34:38 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:34:38,507 - INFO - Chain [1] done processing 2022-10-24 17:34:38,935 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:34:39 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:34:39,110 - INFO - Chain [1] start processing
cutoff_date = Timestamp('1984-06-15 00:00:00') num_outliers = 1
17:34:39 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:34:39,177 - INFO - Chain [1] done processing 2022-10-24 17:34:39,704 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:34:39 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:34:39,735 - INFO - Chain [1] start processing 17:34:39 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:34:39,824 - INFO - Chain [1] done processing 17:34:39 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:34:39,826 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:34:39,828 - WARNING - Optimization terminated abnormally. Falling back to Newton. 17:34:39 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:34:39,855 - INFO - Chain [1] start processing
cutoff_date = Timestamp('1986-06-15 00:00:00') num_outliers = 74
17:35:00 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:00,675 - INFO - Chain [1] done processing 2022-10-24 17:35:01,299 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:01 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:01,336 - INFO - Chain [1] start processing 17:35:01 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:01,446 - INFO - Chain [1] done processing
cutoff_date = Timestamp('1988-06-14 00:00:00') num_outliers = 0
2022-10-24 17:35:02,277 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:02 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:02,320 - INFO - Chain [1] start processing 17:35:02 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:02,450 - INFO - Chain [1] done processing 17:35:02 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:02,452 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:02,454 - WARNING - Optimization terminated abnormally. Falling back to Newton.
cutoff_date = Timestamp('1990-06-14 00:00:00') num_outliers = 0
17:35:02 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:02,491 - INFO - Chain [1] start processing 17:35:03 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:03,898 - INFO - Chain [1] done processing 2022-10-24 17:35:04,671 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:04 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:04,719 - INFO - Chain [1] start processing 17:35:04 - cmdstanpy - INFO - Chain [1] done processing
cutoff_date = Timestamp('1992-06-13 00:00:00') num_outliers = 2
2022-10-24 17:35:04,862 - INFO - Chain [1] done processing 17:35:04 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:04,864 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:04,866 - WARNING - Optimization terminated abnormally. Falling back to Newton. 17:35:04 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:04,908 - INFO - Chain [1] start processing 17:35:07 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:07,859 - INFO - Chain [1] done processing 2022-10-24 17:35:08,717 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:08 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:08,769 - INFO - Chain [1] start processing
cutoff_date = Timestamp('1994-06-13 00:00:00') num_outliers = 0
17:35:08 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:08,931 - INFO - Chain [1] done processing 17:35:08 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:08,933 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:08,934 - WARNING - Optimization terminated abnormally. Falling back to Newton. 17:35:08 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:08,982 - INFO - Chain [1] start processing 17:35:14 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:14,400 - INFO - Chain [1] done processing 2022-10-24 17:35:15,465 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:15 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:15,528 - INFO - Chain [1] start processing
cutoff_date = Timestamp('1996-06-12 00:00:00') num_outliers = 57
17:35:15 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:15,693 - INFO - Chain [1] done processing 2022-10-24 17:35:16,739 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:16 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:16,807 - INFO - Chain [1] start processing
cutoff_date = Timestamp('1998-06-12 00:00:00') num_outliers = 0
17:35:16 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:16,982 - INFO - Chain [1] done processing 2022-10-24 17:35:18,231 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:18 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:18,305 - INFO - Chain [1] start processing
cutoff_date = Timestamp('2000-06-11 00:00:00') num_outliers = 41
17:35:18 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:18,519 - INFO - Chain [1] done processing 2022-10-24 17:35:19,721 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
cutoff_date = Timestamp('2002-06-11 00:00:00') num_outliers = 77
17:35:19 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:19,952 - INFO - Chain [1] start processing 17:35:20 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:20,202 - INFO - Chain [1] done processing 2022-10-24 17:35:21,515 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:21 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:21,598 - INFO - Chain [1] start processing
cutoff_date = Timestamp('2004-06-10 00:00:00') num_outliers = 89
17:35:21 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:21,793 - INFO - Chain [1] done processing 2022-10-24 17:35:23,291 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
cutoff_date = Timestamp('2006-06-10 00:00:00') num_outliers = 30
17:35:23 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:23,375 - INFO - Chain [1] start processing 17:35:23 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:23,667 - INFO - Chain [1] done processing 17:35:23 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:23,669 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:23,670 - WARNING - Optimization terminated abnormally. Falling back to Newton. 17:35:23 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:23,746 - INFO - Chain [1] start processing 17:35:27 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:27,529 - INFO - Chain [1] done processing 2022-10-24 17:35:28,988 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:29 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:29,078 - INFO - Chain [1] start processing
cutoff_date = Timestamp('2008-06-09 00:00:00') num_outliers = 27
17:35:29 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:29,375 - INFO - Chain [1] done processing 17:35:29 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:29,376 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:29,378 - WARNING - Optimization terminated abnormally. Falling back to Newton. 17:35:29 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:29,463 - INFO - Chain [1] start processing 17:35:33 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:33,953 - INFO - Chain [1] done processing 2022-10-24 17:35:35,631 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:35 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:35,726 - INFO - Chain [1] start processing
cutoff_date = Timestamp('2010-06-09 00:00:00') num_outliers = 1
17:35:36 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:36,009 - INFO - Chain [1] done processing 2022-10-24 17:35:37,628 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
cutoff_date = Timestamp('2012-06-08 00:00:00') num_outliers = 0
17:35:37 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:37,863 - INFO - Chain [1] start processing 17:35:38 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:38,195 - INFO - Chain [1] done processing 17:35:38 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:38,197 - ERROR - Chain [1] error: error during processing Operation not permitted 2022-10-24 17:35:38,199 - WARNING - Optimization terminated abnormally. Falling back to Newton. 17:35:38 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:38,294 - INFO - Chain [1] start processing 17:35:43 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:43,516 - INFO - Chain [1] done processing 2022-10-24 17:35:45,193 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:45 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:45,298 - INFO - Chain [1] start processing
cutoff_date = Timestamp('2014-06-08 00:00:00') num_outliers = 5
17:35:45 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:45,630 - INFO - Chain [1] done processing 2022-10-24 17:35:47,702 - INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. 17:35:47 - cmdstanpy - INFO - Chain [1] start processing 2022-10-24 17:35:47,843 - INFO - Chain [1] start processing
cutoff_date = Timestamp('2016-06-07 00:00:00') num_outliers = 1
17:35:48 - cmdstanpy - INFO - Chain [1] done processing 2022-10-24 17:35:48,335 - INFO - Chain [1] done processing
cutoff_date = Timestamp('2018-06-07 00:00:00') num_outliers = 0
# we plot the outliers for a particular time series
import plotly.graph_objs as go

outliers_df = pd.concat(forecast_df_list)
flagged = outliers_df[outliers_df['anomaly_flag']]

# Observed series.
measurement_trace = go.Scatter(
    name='Measurement',
    x=outliers_df['ds'],
    y=outliers_df['y'],
    mode='lines',
    line=dict(color='rgb(31, 119, 180)'),
)
# Invisible upper-bound line; only used as the top edge of the shaded band.
upper_trace = go.Scatter(
    name='Upper Bound',
    x=outliers_df['ds'],
    y=outliers_df['yhat_upper'],
    mode='lines',
    marker=dict(color="#444"),
    line=dict(width=0),
    showlegend=False,
)
# Lower bound; fill='tonexty' shades up to the previous (upper-bound) trace.
lower_trace = go.Scatter(
    name='Lower Bound',
    x=outliers_df['ds'],
    y=outliers_df['yhat_lower'],
    marker=dict(color="#444"),
    line=dict(width=0),
    mode='lines',
    fillcolor='rgba(68, 68, 68, 0.3)',
    fill='tonexty',
    showlegend=False,
)
# Flagged extreme values, drawn as red markers on top.
outlier_trace = go.Scatter(
    name='Outliers',
    x=flagged['ds'],
    y=flagged['y'],
    line=dict(color='red'),
    mode='markers',
)
fig = go.Figure([measurement_trace, upper_trace, lower_trace, outlier_trace])
fig.update_layout(title=f'Anomaly Flag, basin_id = {cod_station}, var = {variable}')
fig.show()
This routine is implemented in anomaly.py and the plot results from the outlier detection are stored in anomaly_plots/.
flux_extreme¶Using the variable flux_extreme we will be loading some images and see how the results look like
The "Rio Loa" watershed, for example, seems to have had a drought since the '90s; nevertheless, around 2003 the effect seems to be reversed. This particular time series tends to have huge variability: some strong meteorological events increased the flux in the watershed.
On the other hand a station from the Center of chile the anomalies trend tends to be more widely spread across the years
In the most southern part we have a similar trend as the center of Chile
It is not very easy to spot the main differences, considering the different time spans, locations, and other variables that might influence the flux between stations. An alternative is to plot the percentage of extreme events across the watersheds; this aggregated plot might signal a trend change at the country level.
we will do that using the number of flagged extreme events divided by the number of total registers, then we aggregate it in a daily df
# Lets plot the percentage of extreme events
# we will do that using the number of flagged extreme events divided by the number of total registers, then we aggregate it in a daily df
# Per-day, per-basin extreme-value flags, produced offline for every station
# by the batch routine in `anomaly.py`.
anomaly_df = pd.read_csv('anomaly_flag.csv.gz', compression='gzip',parse_dates=['date_ts']) # this dataset is the output from the `anomaly.py` script
anomaly_df.head()
| basin_id | date_ts | flux_extreme | precip_extreme | temp_max_extreme | |
|---|---|---|---|---|---|
| 0 | 1001001 | 1989-05-30 | False | False | False |
| 1 | 1001001 | 1989-05-31 | False | False | False |
| 2 | 1001001 | 1989-06-01 | False | False | False |
| 3 | 1001001 | 1989-06-02 | False | False | False |
| 4 | 1001001 | 1989-06-03 | False | False | False |
import plotly.express as px

# Daily share of stations flagged with an extreme flux event, with an OLS
# trend line to eyeball any change over time.
flux_extreme_pct_df = anomaly_df.groupby(['date_ts'], as_index=False)['flux_extreme'].mean()
fig = px.scatter(
    flux_extreme_pct_df,
    x='date_ts',
    y='flux_extreme',
    trendline="ols",
    trendline_color_override="red",
)
fig.show()
# Print the full OLS summary (coefficients, p-values) of the fitted trend.
trend_fit = px.get_trendline_results(fig)
results = trend_fit.iloc[0]["px_fit_results"].summary()
print(results)
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.003
Model: OLS Adj. R-squared: 0.003
Method: Least Squares F-statistic: 38.61
Date: Mon, 24 Oct 2022 Prob (F-statistic): 5.33e-10
Time: 17:35:53 Log-Likelihood: 16878.
No. Observations: 13626 AIC: -3.375e+04
Df Residuals: 13624 BIC: -3.374e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0721 0.002 38.650 0.000 0.068 0.076
x1 -1.098e-11 1.77e-12 -6.213 0.000 -1.44e-11 -7.52e-12
==============================================================================
Omnibus: 8201.903 Durbin-Watson: 0.165
Prob(Omnibus): 0.000 Jarque-Bera (JB): 76264.667
Skew: 2.833 Prob(JB): 0.00
Kurtosis: 13.111 Cond. No. 3.28e+09
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.28e+09. This might indicate that there are
strong multicollinearity or other numerical problems.
From these results we can observe that the trend of flux_extreme events is decreasing over time, and the decrease is statistically significant (the p-value of the x1 coefficient is < 0.05). This matters because it indicates a genuine change in trend over time rather than noise.
However, there is a limitation to our approach: we are not fixing a panel or using a steady random sample from the "extreme_events" distribution. Our approach simplifies the problem by focusing only on the percentage of extreme events; while this is a useful simplification, it may suffer from too-small or too-biased samples when determining the change in trend — for example, we might observe an extreme-event percentage of 0.5 based on only 2 stations, or of 0 based on a single station.
There is a second method that we can use to test our trend-change hypothesis: Bayesian inference. In a nutshell, the method analyzes whether there is a change in trend using a sample-level assumption. With a couple of distributions and priors we can test whether there has been a change in the parameter of the distribution that generates the extreme events.
Assuming that each extreme event (from each basin) is a random sample of a Bernoulli distribution with probability $p_w(t)$ $w \in Watersheds$
$$\mathbb{P}(extreme\_flux_{w}) \sim B(p_w(t))$$

We can add some heterogeneity to the prior distribution over basins and use fixed effects, modeling the event probability with a normal prior truncated to the bounds $[0, 1]$:

$$p_w(t) \sim \mathcal{N}_{[0,1]}\left(\beta_0 + \beta_t \, t + \beta_w,\ \sigma^2\right)$$

where $\beta_w$ is our heterogeneity (fixed-effect) parameter for watershed $w$ and $\beta_t$ captures the time trend. With that model, we can test whether $\beta_t$ is significant (or "credible") and whether the posterior of each $\beta_w$ contains 0.
For the sake of time, and given that the linear model already shows a significant decreasing trend, we will not implement this approach; however, it would be a useful tool to test more precisely whether there is a change in trend.
Finally, merge the anomaly_df with the original dataset
# Keys shared by both frames.
keys = ['basin_id', 'date_ts']
# Columns present only in flux_df (Index.difference returns them sorted).
cols_to_use = flux_df.columns.difference(anomaly_df.columns).tolist()
# Left join keeps every labeled anomaly row and attaches the raw variables.
flux_flag_df = anomaly_df.merge(flux_df[cols_to_use + keys],
                                on=keys,
                                how='left')
flux_flag_df.info(show_counts=True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 3416586 entries, 0 to 3416585 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 basin_id 3416586 non-null int64 1 date_ts 3416586 non-null datetime64[ns] 2 flux_extreme 3416586 non-null bool 3 precip_extreme 3416586 non-null bool 4 temp_max_extreme 3416586 non-null bool 5 area_km2 3416586 non-null float64 6 date 3416586 non-null object 7 date_y 3416586 non-null object 8 date_ym 3416586 non-null object 9 flux 3416586 non-null float64 10 gauge_name 3416586 non-null object 11 lat 3416586 non-null float64 12 lon 3416586 non-null float64 13 mean_elev 3416586 non-null float64 14 precip 3416586 non-null float64 15 temp_max 3416586 non-null float64 dtypes: bool(3), datetime64[ns](1), float64(7), int64(1), object(4) memory usage: 374.7+ MB
We first merge the anomaly_df with the data from the original dataset, but only keeping the labeled data, this will remove the missing points and the historical data that we use to detect the extreme values
flux_extreme prediction¶we would like to add many features that we can build from the dataset, some of them are the following ones:
ft_rolling_sum_<var>_extreme_{t}: rolling sum of <var> $\in$ ['flux', 'precip', 'temp_max'] extreme events from the last t daysft_rolling_avg_<var>_extreme_{t}: rolling avg of <var> $\in$ ['flux', 'precip', 'temp_max'] extreme events from the last t daysft_rolling_avg_<var>_{t}: rolling avg of <var> $\in$ ['flux', 'precip', 'temp_max'] from the last t daysft_rolling_sum_<var>_{t}: rolling sum of <var> $\in$ ['flux', 'precip', 'temp_max'] from the last t daysft_basin_lat_trunc: truncated latitudft_basin_lng_trunc: truncated longft_num_month: numeric month categoryft_area_km2: from the original datasetft_mean_elev: from the original datasetAdditionally, we could add lag variables and compounded such as the difference of variables vs the same variable 1year ago etc, but for the sake of simplicity and time we only keep the basic ones on this list
# feature creation
# 1. `ft_rolling_sum_<var>_extreme_{t}`: rolling sum of extreme flags over the last t days
# 2. `ft_rolling_avg_<var>_extreme_{t}`: rolling mean of extreme flags over the last t days
# 3. `ft_rolling_avg_<var>_{t}`: rolling mean of the raw variable over the last t days
# 4. `ft_rolling_sum_<var>_{t}`: rolling sum of the raw variable over the last t days
flux_ft_df = flux_flag_df.copy()
# the rolling windows below assume each basin's rows are contiguous and in time order
flux_ft_df.sort_values(by=['basin_id', 'date_ts'], ascending=True, inplace=True)
for var in ['flux', 'precip', 'temp_max']:
    for d_period in [1, 3, 7, 14, 28, 100, 365]:
        flux_ft_df[f'ft_rolling_sum_{var}_d{d_period}'] = flux_ft_df.groupby('basin_id', as_index=False)[var].rolling(d_period).sum()[var]
        flux_ft_df[f'ft_rolling_avg_{var}_d{d_period}'] = flux_ft_df.groupby('basin_id', as_index=False)[var].rolling(d_period).mean()[var]
        flux_ft_df[f'ft_rolling_sum_{var}_extreme_d{d_period}'] = flux_ft_df.groupby('basin_id', as_index=False)[f'{var}_extreme'].rolling(d_period).sum()[f'{var}_extreme']
        flux_ft_df[f'ft_rolling_avg_{var}_extreme_d{d_period}'] = flux_ft_df.groupby('basin_id', as_index=False)[f'{var}_extreme'].rolling(d_period).mean()[f'{var}_extreme']
# 5. `ft_basin_lat_trunc`
# 6. `ft_basin_lng_trunc`
# truncate coordinates to 0.1 degree so nearby basins share a bucket
flux_ft_df['ft_basin_lon_trunc'] = flux_ft_df['lon'].round(1)  # test with len(flux_ft_df['lon'].round(1).unique())
# BUG FIX: this line previously re-assigned 'ft_basin_lon_trunc', silently
# overwriting the longitude feature with latitude and never creating the
# latitude feature at all.
flux_ft_df['ft_basin_lat_trunc'] = flux_ft_df['lat'].round(1)
# 7. `ft_num_month`
flux_ft_df['ft_num_month'] = flux_ft_df['date_ts'].dt.month
# 8. `ft_area_km2`
flux_ft_df['ft_area_km2'] = flux_ft_df['area_km2'].round(0)  # remove some variability
# 9. `ft_mean_elev`
flux_ft_df['ft_mean_elev'] = flux_ft_df['mean_elev'].round(0)
# replace NaNs (leading rolling-window rows of each basin) with a sentinel
# lower than the min temperature; startswith avoids accidental substring hits
features = [col for col in flux_ft_df.columns if col.startswith('ft_')]
flux_ft_df[features] = flux_ft_df[features].fillna(-100)
Our prediction horizon will be the number of days before the event that we will use to announce that an extreme event will occur. This horizon will be entangled with the use of our prediction, given that we are predicting some extreme flux events on a watershed probably we are interested in a humanitarian evacuation or damage mitigation if we are thinking of places that are closer to the rivers/lakes that might be flooded.
Our flux_extreme variable is not (only) the first day of the -extreme- event, but (probably) will flag the following days as well. In our hypothetical application, we are concerned about the first day of the event, but we could extend that to understand if the flux event will still be happening in the following 3 days, the duration of the event might be significant as well to understand a possible evacuation.
Assuming that application probably we are looking into 2 or 3 days in advance, so maybe some measures can be applied. We will use 3 days just to define some horizon, this target, plus, the features defined in the first step, will create our master_df dataframe
features = [col for col in flux_ft_df.columns if 'ft_' in col]
keys = ['date_ts']  # we will ignore basin_id
target = ['flux_extreme_3d']

# Target: will an extreme flux event be flagged 3 days from now (per basin)?
flux_ft_df['flux_extreme_3d'] = flux_ft_df.groupby('basin_id', as_index=False)['flux_extreme'].shift(-3)

master_df = flux_ft_df[target + keys + features].copy()
# drop rows without a 3-day-ahead label (the tail of each basin's series)
master_df.dropna(inplace=True)

# Time-based split: in production we always predict future dates from
# historical data, so a chronological hold-out mirrors the real test scenario.
cutoff_date = '2019-01-01'  # train/test cutoff; the hold-out is never used in CV
train_df = master_df[master_df['date_ts'] <= cutoff_date].copy()
test_df = master_df[master_df['date_ts'] > cutoff_date].copy()

target_balance = master_df['flux_extreme_3d'].mean()
print(f'{target_balance = }')
print(f'num of features = {len(features)}')
target_balance = 0.06078741052020088 num of features = 88
As we can observe, our dataset is very imbalanced (P ≈ 0.06) and we would like to fit a model to predict this target. We will use several evaluation metrics, but our main metric will be the AUCPR, since it is a normalized indicator and behaves well on imbalanced datasets.
We will start sampling our dataset to choose the model algorithm and to perform feature selection we will then search for some parameters improvement based on some cross validation metrics and finally fit our model.
In this analysis we will use TimeSeriesSplit because our real scenario will always be facing futures events with historical data.
# Build a 10% per-class sample of the train set for fast model screening;
# the final model is fitted on the full train_df.
train_df.sort_values(by='date_ts', inplace=True)
# FIX: seed the sampling so the model-selection results are reproducible
# (previously each run screened models on a different random sample).
train_sample_df = train_df.groupby(target).apply(lambda x: x.sample(frac=0.1, random_state=0))
print(train_sample_df.shape)
print(train_sample_df[target].mean())
# re-sort chronologically: TimeSeriesSplit below assumes time order
train_sample_df.sort_values(by='date_ts', inplace=True)
(330816, 90) flux_extreme_3d 0.061487 dtype: float64
We will train a list of models in order to select our weak-learner algorithm (based on performance and runtime) and also our preferred final modeling algorithm (based on performance).
# %%script false
import time
import numpy as np
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_selection import RFECV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV  # baseline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier  # XGB
from sklearn.ensemble import HistGradientBoostingClassifier  # LGBM

# candidate estimators, all with vanilla (default) parameters
models = {
    'RCV': RidgeClassifierCV(),
    'DTree': DecisionTreeClassifier(),
    'LOGIT': LogisticRegressionCV(),
    'RFC': RandomForestClassifier(),
    'ABC': AdaBoostClassifier(),
    'XGB': GradientBoostingClassifier(),
    'LGBM': HistGradientBoostingClassifier(),
}

y_vector = train_sample_df[target].astype(int).values.ravel()
x_matrix = train_sample_df[features].values

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# chronological CV: every fold trains on the past and validates on the future
tscv = model_selection.TimeSeriesSplit(n_splits=3)
model_results = []
for model_name, estimator in models.items():
    init_time = time.time()
    scores = model_selection.cross_validate(
        estimator,
        x_matrix,
        y_vector,
        scoring=[
            'average_precision',  # AUCPR https://stats.stackexchange.com/q/157012/274422
            'roc_auc',            # AUC
            'f1',
        ],
        cv=tscv,
    )
    # average the per-fold scores (nan-safe in case a fold fails to score)
    aucpr = np.nanmean(scores['test_average_precision'])
    auc = np.nanmean(scores['test_roc_auc'])
    f1 = np.nanmean(scores['test_f1'])
    elapsed_time = time.time() - init_time
    model_results.append({
        'aucpr': aucpr,
        'auc': auc,
        'f1': f1,
        'elapsed_time': elapsed_time,
        'name': model_name,
    })
    print(f"{model_name}, {aucpr = :.3f}, {auc = :.3f}, {f1 = :.3f}, {elapsed_time = :.1f}s")
results_df = pd.DataFrame(model_results)
RCV, aucpr = 0.722, auc = 0.928, f1 = 0.670, elapsed_time = 7.0s DTree, aucpr = 0.282, auc = 0.751, f1 = 0.501, elapsed_time = 42.0s LOGIT, aucpr = 0.581, auc = 0.909, f1 = 0.471, elapsed_time = 226.9s RFC, aucpr = 0.728, auc = 0.942, f1 = 0.665, elapsed_time = 257.1s ABC, aucpr = 0.712, auc = 0.942, f1 = 0.647, elapsed_time = 126.4s XGB, aucpr = 0.739, auc = 0.947, f1 = 0.673, elapsed_time = 653.4s LGBM, aucpr = 0.744, auc = 0.951, f1 = 0.671, elapsed_time = 33.5s
From the results above we can infer that LGBM is probably our choice for the final model, and we will use RidgeClassifierCV as our weak learner to perform the feature selection.
We will search for a subset of features based on the weak learner RCV, then we will use that feature subset to perform some manual parameter search and finally fit the final model.
# ================================ #
# ====== feature selection ====== #
# ================================ #
import matplotlib.pyplot as plt

y_vector = train_sample_df[target].astype(int).values.ravel()
x_matrix = train_sample_df[features].values
warnings.filterwarnings("ignore")

# Recursive feature elimination with the cheap RCV weak learner; scoring
# matches the project's main metric (AUCPR / average precision).
# NOTE(review): cv=3 here is a (stratified) k-fold, not TimeSeriesSplit as
# used elsewhere — confirm whether chronological CV is wanted for selection.
rfecv = RFECV(estimator=RidgeClassifierCV(),
              step=1,
              cv=3,
              n_jobs=10,
              verbose=0,
              scoring='average_precision')
rfecv.fit(x_matrix, y_vector)
print("Optimal number of features : %d" % rfecv.n_features_)

# Record, per feature, whether RFECV kept it and its elimination rank.
feat_selection = []
for i, feature in enumerate(features):
    feat_selection.append({'name': feature,
                           'selected': rfecv.support_[i],
                           'ranking': rfecv.ranking_[i]})
feat_selection = pd.DataFrame(feat_selection)

plt.figure()
plt.xlabel("Number of features selected")
# BUG FIX: the label said "accuracy" but the RFECV scoring is average precision
plt.ylabel("Cross validation score (average precision)")
# BUG FIX: the first grid score corresponds to 1 selected feature, not 0,
# so the x-axis must start at 1 to line up with the scores
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
Optimal number of features : 66
# Keep the top-ranked features (RFECV assigns rank 1 to every kept feature).
selected_features = feat_selection.loc[feat_selection['ranking'] <= 1, 'name'].unique().tolist()
print(len(selected_features))
66
We test some parameter combinations based on the previous results, then keep the best combination for the final fit; we also restrict the inputs to the already-filtered selected_features.
# we test some combinations of parameters and check the CV metrics
# BUG FIX: the dict previously used the key 'LGBM_v5' twice, so the
# (max_iter=10, max_leaf_nodes=30, max_depth=20) configuration was silently
# discarded before evaluation; the last entry is renamed 'LGBM_v6' so that
# all six configurations actually run.
models = {'LGBM_v1': HistGradientBoostingClassifier(),
          'LGBM_v2': HistGradientBoostingClassifier(max_iter=50, max_leaf_nodes=10, max_depth=5, l2_regularization=0),
          'LGBM_v3': HistGradientBoostingClassifier(max_iter=100, max_leaf_nodes=5, max_depth=10, l2_regularization=0),
          'LGBM_v4': HistGradientBoostingClassifier(max_iter=100, max_leaf_nodes=15, max_depth=15, l2_regularization=0.5),
          'LGBM_v5': HistGradientBoostingClassifier(max_iter=10, max_leaf_nodes=30, max_depth=20, l2_regularization=0.5),
          'LGBM_v6': HistGradientBoostingClassifier(l2_regularization=0.5),
          }
# candidates are evaluated on the RFECV-selected feature subset only
y_vector = train_sample_df[target].astype(int).values.ravel()
x_matrix = train_sample_df[selected_features].values
# ignore warnings
import warnings
warnings.filterwarnings("ignore")
# chronological CV, consistent with the model-selection step above
tscv = model_selection.TimeSeriesSplit(n_splits=3)
model_results = []
for mdl in models.keys():
    init_time = time.time()
    scores = model_selection.cross_validate(models[mdl],
                                            x_matrix,
                                            y_vector,
                                            scoring=['average_precision',  # AUCPR https://stats.stackexchange.com/q/157012/274422
                                                     'roc_auc',  # AUC
                                                     'f1',
                                                     ],
                                            cv=tscv)
    # average the per-fold scores (nan-safe in case a fold fails to score)
    aucpr = np.nanmean(scores['test_average_precision'])
    auc = np.nanmean(scores['test_roc_auc'])
    f1 = np.nanmean(scores['test_f1'])
    elapsed_time = time.time() - init_time
    model_results.append({'aucpr': aucpr,
                          'auc': auc,
                          'f1': f1,
                          'elapsed_time': elapsed_time,
                          'name': mdl})
    print(f"{mdl}, {aucpr = :.3f}, {auc = :.3f}, {f1 = :.3f}, {elapsed_time = :.1f}s")
results_df = pd.DataFrame(model_results)
LGBM_v1, aucpr = 0.742, auc = 0.949, f1 = 0.671, elapsed_time = 29.8s LGBM_v2, aucpr = 0.740, auc = 0.948, f1 = 0.668, elapsed_time = 9.6s LGBM_v3, aucpr = 0.739, auc = 0.948, f1 = 0.670, elapsed_time = 11.5s LGBM_v4, aucpr = 0.746, auc = 0.950, f1 = 0.674, elapsed_time = 20.9s LGBM_v5, aucpr = 0.744, auc = 0.950, f1 = 0.671, elapsed_time = 31.9s
we fit the selected model with the selected features on the overall train dataframe
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import average_precision_score, roc_auc_score
# Full (unsampled) train matrices, restricted to the RFECV-selected features.
y_vector = train_df[target].astype(int).values.ravel()
x_matrix = train_df[selected_features].values
# Hold-out test set (dates after the cutoff), used only for final scoring.
y_test = test_df[target].astype(int).values.ravel()
x_test = test_df[selected_features].values
# Best configuration from the parameter search above.
model = HistGradientBoostingClassifier(l2_regularization=0.5) #LGBM
# Out-of-fold decision scores for every training row, so the PR curve is not
# computed on in-sample predictions.
# NOTE(review): cv=3 here is a plain k-fold, not TimeSeriesSplit as used
# elsewhere — cross_val_predict needs each row predicted exactly once, which
# TimeSeriesSplit cannot provide; confirm this is acceptable.
y_scores = cross_val_predict(model, x_matrix, y_vector, cv=3, method='decision_function' )
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall vs. decision threshold and mark the
    operating point where recall first drops below 70%.

    Args come straight from sklearn's precision_recall_curve: `precisions`
    and `recalls` have one more element than `thresholds`, and `recalls`
    is non-increasing along the threshold axis.
    """
    # recalls >= 0.70 is a run of True then False; argmin returns the first
    # False, i.e. the first threshold where recall falls below 70%.
    precisions_70_recall = precisions[np.argmin(recalls >= 0.70)]
    threshold_70_recall = thresholds[np.argmin(recalls >= 0.70)]
    # precisions/recalls are sliced [:-1] to match the length of thresholds.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold")
    # red guide lines marking the 70%-recall operating point
    plt.plot([threshold_70_recall, threshold_70_recall], [0., 0.7], "r:")
    plt.axis([-5, 5, 0, 1])
    plt.plot([-5, threshold_70_recall], [0.7, 0.7], "r:")
    plt.plot([-5, threshold_70_recall], [precisions_70_recall, precisions_70_recall], "r:")
    plt.plot([threshold_70_recall], [0.7], "ro")
    plt.plot([threshold_70_recall], [precisions_70_recall], "ro")
    plt.grid(True)
    plt.legend()
    plt.show()
    print(f' 70% recall: {threshold_70_recall = }, {precisions_70_recall = }')
precisions, recalls, thresholds = precision_recall_curve(y_vector, y_scores)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
70% recall: threshold_70_recall = -1.032380449859248, precisions_70_recall = 0.6637067076865707
Once the model is fitted we can estimate the score threshold needed to "detect" at least 70% of the flux_extreme events, which is the same as asking for a 70% recall. To achieve that we need a score threshold of about -1.032, which gives us a precision of about 0.66.
As you can observe in the upper plot there is a trade-off between the precision and recall, usually a higher recall, comes with a decrease in precision.
As we explain in the model fitting chapter (7) we will be using AUCPR for the evaluation of this model, we perform the scoring using the test_df dataframe.
# Final fit on the complete training set, evaluated once on the hold-out.
fitted_model = model.fit(x_matrix, y_vector)
# rank test rows by the predicted probability of the positive class
y_scores_test = fitted_model.predict_proba(x_test)[:, 1]
auc_score = roc_auc_score(y_test, y_scores_test)
aucpr_score = average_precision_score(y_test, y_scores_test)
print(f'{aucpr_score = :.3f} {auc_score = :.3f}')
aucpr_score = 0.737 auc_score = 0.962
To determine the feature importances we use the permutation feature importance described in the [Sklearn documentation](https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-feature-importance).
There is strong criticism of using the traditional MDI metrics to determine feature importance. Ref
from sklearn.inspection import permutation_importance

# Permutation importance on the training matrix, 3 shuffles per feature.
r = permutation_importance(fitted_model, x_matrix, y_vector,
                           n_repeats=3,
                           random_state=0)

# Print features from most to least important, keeping only those whose
# mean importance exceeds two standard deviations (i.e. clearly non-zero).
for i in r.importances_mean.argsort()[::-1]:
    mean_i = r.importances_mean[i]
    std_i = r.importances_std[i]
    if mean_i - 2 * std_i <= 0:
        continue
    print(f"{selected_features[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")
ft_rolling_sum_flux_extreme_d1 0.005 +/- 0.000 ft_rolling_sum_temp_max_d1 0.003 +/- 0.000 ft_rolling_sum_flux_extreme_d3 0.002 +/- 0.000 ft_rolling_sum_flux_extreme_d7 0.002 +/- 0.000 ft_rolling_sum_precip_d1 0.001 +/- 0.000 ft_rolling_sum_flux_extreme_d28 0.001 +/- 0.000 ft_rolling_avg_precip_d365 0.001 +/- 0.000 ft_rolling_sum_flux_extreme_d365 0.001 +/- 0.000 ft_rolling_sum_flux_d1 0.001 +/- 0.000 ft_rolling_sum_precip_d7 0.001 +/- 0.000 ft_rolling_sum_flux_extreme_d14 0.001 +/- 0.000 ft_rolling_sum_precip_d3 0.001 +/- 0.000 ft_rolling_sum_precip_extreme_d1 0.000 +/- 0.000 ft_rolling_avg_temp_max_d14 0.000 +/- 0.000 ft_rolling_avg_precip_d28 0.000 +/- 0.000 ft_rolling_avg_temp_max_d365 0.000 +/- 0.000 ft_rolling_sum_flux_extreme_d100 0.000 +/- 0.000 ft_rolling_avg_temp_max_d28 0.000 +/- 0.000 ft_rolling_sum_flux_d14 0.000 +/- 0.000 ft_rolling_avg_precip_d100 0.000 +/- 0.000 ft_rolling_sum_temp_max_d7 0.000 +/- 0.000 ft_rolling_sum_temp_max_d3 0.000 +/- 0.000 ft_rolling_sum_precip_extreme_d3 0.000 +/- 0.000 ft_rolling_avg_temp_max_d100 0.000 +/- 0.000 ft_rolling_sum_precip_extreme_d100 0.000 +/- 0.000 ft_rolling_avg_precip_d14 0.000 +/- 0.000 ft_rolling_avg_precip_extreme_d365 0.000 +/- 0.000 ft_rolling_sum_precip_extreme_d28 0.000 +/- 0.000 ft_rolling_sum_temp_max_extreme_d365 0.000 +/- 0.000 ft_rolling_sum_temp_max_extreme_d100 0.000 +/- 0.000 ft_rolling_sum_precip_extreme_d7 0.000 +/- 0.000 ft_rolling_sum_precip_extreme_d14 0.000 +/- 0.000 ft_rolling_sum_flux_d7 0.000 +/- 0.000 ft_rolling_sum_temp_max_extreme_d28 0.000 +/- 0.000 ft_rolling_sum_flux_d3 0.000 +/- 0.000 ft_rolling_sum_temp_max_extreme_d7 0.000 +/- 0.000
# Bar chart of permutation importances, most important first.
fig, ax = plt.subplots(figsize=(10, 6))
features_importances = pd.Series(r.importances_mean, index=selected_features)
features_importances.sort_values(inplace=True, ascending=False)
# BUG FIX: r.importances_std is ordered like `selected_features`, but the
# series was just re-sorted by importance; passing the raw std array attached
# each error bar to the wrong feature. Reindex the stds to the sorted order.
std_sorted = pd.Series(r.importances_std, index=selected_features).reindex(features_importances.index)
features_importances.plot.bar(yerr=std_sorted.values, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# permutation_importance defaults to the estimator's scorer (accuracy here)
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()
Given that we are predicting the occurrence of extreme flux events there are a few things that might be prompting these very good results, I probably will focus on the fact that there is a strong probability that a flux_extreme event will last more than a few days, therefore our model might be very good at predicting what is the probability of a consecutive day of event rather than predicting what is the probability of an event occurring for the first day. Changing the target variable with that end probably will provide a more useful model and more realistic one.
Given the time constraints of this assignment, it was not possible to test the hypothesis of the decrease in flux_extreme events in a robust manner. We did get a signal from the linear model that uses the percentage of events, but it is possible that we will need a more robust methodology to that end; the Bayesian methodology proposed earlier is a step in that direction.
Also, we didn't perform a very thorough or robust hyperparameter optimization on the final classifier. It is very common that such hyperparameter searches only improve the model marginally compared to better feature engineering or data selection.